% estimation_sequential_industries

% This code estimates the model for each group of firms. 
% To make the code readable, below is a streamlined and annotated version of the code used in the paper.
% The code first produces the starting values of the parameters for the likelihood search, 
% then it estimates the model.
% 
% For each group of firms, the model is estimated using the following procedure:
% 1) Find starting values of the parameters
% 2) Estimate parameter values from the likelihood search
% 3) Check that the score process has approximately mean zero.
% 
% Starting values of the parameters are obtained as follows:
% a) muP and sigmaP from mean and std dev of the cross sectional average cash flows, i.e., their approximate sample counterparts
% b) alpha, rho, sigmaA from likelihood search (no simple sample counterparts exist).
% The coefficient alpha plays the role of P_1, which is normalized to one.
% That is, alpha scales the level of the latent process P_t and for computational purposes it is treated as a parameter.
% Parameter estimates are obtained from reparametrized likelihood search.
% The reparametrized likelihood merely ensures that the volatilities are positives, P_1 is positive, 
% and the correlation rho is between minus one and one.
% 
% The first part of the code is to prepare the unbalanced cash flow data to compute the starting values for the likelihood search. 
% As usual in numerical likelihood search, starting values are important. 
% The final parameter estimates need to be winsorized as explained in the paper.
%
% As a sanity check, running the code once as it is (i.e., without recomputing starting values, etc.), 
% should produce parameter values that are approximately in the ballpark of the estimates reported in the paper, at least the median values. 

%% Load cash flow panels indexed by 1, 2, etc. for all groups of firms (10 firms each group).
% COMPUSTAT cash flow panels are not provided because of copyright issues.
% Cash flow data and groups are constructed as explained in the paper.

%% Sample time length:
year_all = 1971:2018;
num_years_all = length(year_all);

%% Extract increasing group numbers: one value for each group
count_group = 1;
groups_by_cf_all = sort(groups_by_cf_all);
groups_by_cf_series(1) = groups_by_cf_all(1);
for ii = 2 : length(groups_by_cf_all)
    if groups_by_cf_all(ii) > groups_by_cf_all(ii-1)
        count_group = count_group + 1;
        groups_by_cf_series(count_group) = groups_by_cf_all(ii);
    end
end
groups_by_cf_series = groups_by_cf_series';
% figure, plot(groups_by_cf_series,'b.'), ylabel('SIC3 by CF')
% The graph above should be an increasing line.

group_series = groups_by_cf_series;
groups_all = groups_by_cf_all;
numGroups = length(group_series);

%% Initialize variables:
% Once variables are initialized, comment them to recompute the starting
% values or reestimate the parameters for a given group of firms
% (otherwise computed values will be overwritten with NaNs).

% Starting values:
parameters_series_mv = NaN(numGroups,5); % PARAMETERS = muP, sigmaP, alpha, sigmaA, rho 
loglike_series_mv = NaN(numGroups,1); % value of the log-likelihood
convergence_series_mv = NaN(numGroups,1); % exit flag: 1 convergence, 0 not-convergence

num_firms_series = NaN(numGroups,1);  % initialize
num_firms_kept_series = NaN(numGroups,1);

% Estimates:
parameters_series_all_mv = NaN(numGroups,5); % PARAMETERS = muP, sigmaP, alpha, sigmaA, rho 
loglike_series_all_mv = NaN(numGroups,1); % value of the log-likelihood
convergence_series_all_mv = NaN(numGroups,1); % exit flag: 1 convergence, 0 not-convergence


%% FOR loop over the groups of firms:
for thatGroup = 1 : numGroups
    % To recompute starting values or reestimate the model for a specific group of firms,
    % set thatGroup = that group in the FOR loop above.
    
    thatGroup
    %% Take one group:
    that_industry_group = group_series(thatGroup);
    that_industry_firms = find(that_industry_group == groups_all);
    that_industry_CF = CF_all(that_industry_firms,:); % CF for all firms in one group
    that_industry_gvkey = gvkey_all(that_industry_firms); % gvkey of all firms within that group
    Tobs = cols(that_industry_CF);
    numFirms = rows(that_industry_CF);
    num_firms_series(thatGroup) = numFirms;
    that_industry_NaN = find(isnan(that_industry_CF)); % positions of NaN's in the CF panel
    
    % To find starting values: treat raw cash flows (CF) as follows:
    % i) store the position of NaN's (missing values) 
    % ii) fill-in missing values, time series interpolate, and winsorize/trim CF
    % iii) put back NaN's in the same firm-year positions as in the original CF panel.
    
    % To recompute starting values, one possibility would be to discard firms with many missing values.
    % In the paper, we did not use this option, but the code allows this.
    % Below, no firms is discarded because the cut off is set to 1. Values of cut off < 1 discard firms.
    fraction_missing_CF_per_firm = NaN(numFirms,1);
    that_industry_CF_most_complete = NaN(1,Tobs); % initialize CF matrix
    % Recall, one row is CF for one firm
    that_industry_gvkey_most_complete = NaN; % initialize gvkey vector    
    for ii = 1 : numFirms
        that_firm_CF = that_industry_CF(ii,:);
        that_firm_gvkey = that_industry_gvkey(ii);
        fraction_missing_CF_per_firm(ii) = sum(isnan(that_firm_CF)) / Tobs;
        if fraction_missing_CF_per_firm(ii) <= 1 % = 1 -> means no firms are discarded
            that_industry_CF_most_complete = [that_industry_CF_most_complete; that_firm_CF];
            that_industry_gvkey_most_complete =  [that_industry_gvkey_most_complete; that_firm_gvkey]; % store that firm with not many missing values
        end
    end % END discard firms with many missing values (in the paper, no firms are discarded)
    that_industry_CF_most_complete = that_industry_CF_most_complete(2:end,:); % extracted CF panel, discard first row of NaN from initialization
    that_industry_gvkey_most_complete = that_industry_gvkey_most_complete(2:end); % extracted gvkey vector, discard first NaN from initialization
    num_firms_kept_series(thatGroup) = rows(that_industry_CF_most_complete);
    
    
    %% To find starting values: 
    % Interpolate cash flow data to fill in missing values.
    % Later in the code interpolated missing values will be removed and replaced by NaNs as in the original unbalanced CF panel.
    warning('off')
    numFirms_most_complete = rows(that_industry_CF_most_complete);
    that_industry_CF_most_complete_interpolated = NaN(numFirms_most_complete, Tobs);
    that_industry_CF_most_complete(find(isnan(that_industry_CF_most_complete(:,1))),1) = 0;
    that_industry_CF_most_complete(find(isnan(that_industry_CF_most_complete(:,2))),2) = 0;
    time_years = [1:Tobs]; % years from 1971 to 2018
    
    for ii = 1 : numFirms_most_complete
        that_firm_CF = that_industry_CF_most_complete(ii,:); % CF time series of one firm
        % plot(time_years', that_firm_CF', '-o')
        if isnan(that_firm_CF(end)) % = 1 when there is NaN at the end of the sample period
            slope_coeff = regress(that_firm_CF',[ones(Tobs,1), time_years']);
            that_firm_CF(end) = slope_coeff(1) + slope_coeff(2) * time_years(end); % compute fitted value at the end of sample
            that_industry_CF_most_complete(ii,end) = that_firm_CF(end); % -> replace last NaN obs with fitted value
            % figure, plot(time_years', that_industry_CF_most_complete(ii,:)', '-o')
        end
    end
        
    % Interpolate remaining missing values (no NaN at the end of sample period):
    for ii = 1 : numFirms_most_complete
        CF_to_interpolate = that_industry_CF_most_complete(ii,:); % CF to interpolate for a given firm
        CF_interpolated = interp1(time_years,CF_to_interpolate,time_years,'pchip','extrap');
        that_industry_CF_most_complete_interpolated(ii,:) = CF_interpolated;
    end % END interpolation
    
    if size(that_industry_CF_most_complete_interpolated) ~= size(that_industry_CF)
        error('Different sizes of that_industry_CF and that_industry_CF_most_complete_interpolated')
    end
	% figure, subplot(1,2,1), plot(that_industry_CF'), subplot(1,2,2), plot(that_industry_CF_most_complete_interpolated')

    %% To find starting values, winsorize cash flow data:
    % When necessary to recompute starting values, set winsor_cutoff to 3 or 3.5 instead of 4.
    that_industry_CF_final = NaN(numFirms_most_complete,Tobs);
    for tt = 1 : Tobs
        that_year_CF = that_industry_CF_most_complete_interpolated(:,tt); % for all firms, year t
        minCF = quantile(that_year_CF,0.1); 
        maxCF = quantile(that_year_CF,0.9);
        that_year_too_high_CF = find(that_year_CF > maxCF);
        that_year_too_low_CF = find(that_year_CF < minCF);
        that_year_CF(that_year_too_high_CF) = maxCF;
        that_year_CF(that_year_too_low_CF) = minCF;
        
        winsor_cutoff = 4; % this can be changed to find other starting values
        that_year_mean = median( that_year_CF );
        that_year_stdv = std( that_year_CF );
        that_year_too_high_CF = find(that_year_CF > that_year_mean + winsor_cutoff * that_year_stdv);
        that_year_too_low_CF = find(that_year_CF < that_year_mean - winsor_cutoff * that_year_stdv);
        that_year_CF(that_year_too_high_CF) = that_year_mean + winsor_cutoff * that_year_stdv;
        that_year_CF(that_year_too_low_CF) = that_year_mean - winsor_cutoff * that_year_stdv;
        
        that_industry_CF_final(:,tt) = that_year_CF; % cash flows are winsorized
    end % END winsorize
    % figure, plot(that_industry_CF_final')
    
    that_industry_CF_final_mv = that_industry_CF_final; % to re-allocate CF values, to possibly winsorized data
    that_industry_CF_final_mv(that_industry_NaN) = NaN; % replace interpolated values by NaN
    % figure, plot(that_industry_CF_final_mv')

    
    %% Starting values based on CF panel with NaN's (missing values):
    data_mv = that_industry_CF_final_mv; % panel of cash flows with NaN's
    data_mv(isnan(data_mv)) = 10^10; % replace NaN by large number, 10^10, just to perform matrix calcuations in Matlab. 
    % The large numbers above are eliminated by the S_t matrix in the generalized Kalman filter.
	data_zeroone = 1 - isnan(that_industry_CF_final_mv); % i.e., auxiliary matrix: zero in correspondence of a NaN in data_mv

    % Find starting and ending point of CF panel:
    sample4estimation_start_end = sum(~isnan(that_industry_CF_final_mv),1) >= 1;
    T1 = min( find(sample4estimation_start_end == 1) );
    Tend = max( find(sample4estimation_start_end == 1) );
        
    Tobs_mv = cols( data_mv(:,T1:Tend) );
	nFirms = rows(that_industry_CF_final_mv);
    
    CFavg_mv = nanmean(that_industry_CF_final_mv,1); % average CF across firms, for each year
	if CFavg_mv(T1) <= 0 % not allowed
        CFavg_mv(T1) = 0.01;
    end
    rateCFavg_mv = (CFavg_mv(2:end) - CFavg_mv(1:end-1))./abs(CFavg_mv(1:end-1));
    rateCFavg_mv(isinf(rateCFavg_mv)) = NaN; % replace Inf by NaN
    
    % The following lines of code computes the starting values of the parameters. 
    % These values work for some but not all groups of firms.
    % When necessary to recompute starting values for the likelihood search, use the following: 
    % a) cut off point of trimmed mean equal to 10 or 20 instead of 15
    % b) pre-starting value of alpha equal to 0.1 or 0.3 intead of 0.2
    % c) pre-starting value of sigmaA equal to 0.2 or 0.3 instead of 0.1
    % d) pre-starting value of rho equal to -0.2, -0.1, 0.1 or 0.2 instead of 0. 
    
    % Starting values of muP and sigmaP:
    muP_start_mv = trimmean( rateCFavg_mv, 15);
	sigmaP_start_mv = sqrt( trimmean( ( rateCFavg_mv - muP_start_mv).^2, 15 ) );

    alpha_start_mv = 0.2; % pre-startig value
    sigmaA_start_mv = 0.1; % pre-startig value
    rho_start_mv = 0; % pre-startig value
    dt = 1; % time step one year
    Xhat1_mv = 1; % P_1 normalized to 1
    
    % Find starting values of alpha, sigmaA, rho, via reparametried likelihood search:
    param_start_alpha_sigmaA_rho_mv = [alpha_start_mv, sigmaA_start_mv, rho_start_mv];
    [param_estimate_reparam_alpha_sigmaA_rho_mv, minusloglike_estimate_mv, exit_flag_mv] = fminsearch(@(param) loglike_model_cash_flow_reparam_alpha_sigmaA_rho_mv(param,data_mv(:,T1:Tend),data_zeroone(:,T1:Tend),Tobs_mv,nFirms,dt,Xhat1_mv,muP_start_mv,sigmaP_start_mv),param_start_alpha_sigmaA_rho_mv);
    param_estimate_mv = [muP_start_mv, sigmaP_start_mv, param_convert_reparam_true_alpha_sigmaA_rho(param_estimate_reparam_alpha_sigmaA_rho_mv)];
    % For this likelihood search, use fminsearch even if it is slow.
    
    parameters_series_mv(thatGroup,:) = param_estimate_mv; % store starting values
    loglike_series_mv(thatGroup) = - minusloglike_estimate_mv;
    convergence_series_mv(thatGroup) = exit_flag_mv;
    
	disp('Current starting values:')
    disp(param_estimate_mv)
    disp('Convergence exit flag:')
    disp(exit_flag_mv)
 
	CFavg_mv = nanmean(that_industry_CF_final_mv,1); % average CF across firms, for each year
    if CFavg_mv(T1) <= 0 % not allowed
        CFavg_mv(T1) = 0.01;
    end
	Xhat1_mv = CFavg_mv(T1) / (param_estimate_mv(3)*dt); % improved calculation of P_1
    
    
    %% Check that the vector valued score process has approximately zero mean:
    loglike_ts_mv = @(param) loglike_model_cash_flow_timeseries_mv(param,data_mv,data_zeroone,Tobs,nFirms,dt,Xhat1_mv)'; % (Tobs-1) x 1
    score_ts_mv = gradp( loglike_ts_mv, param_estimate_mv' ); % (Tobs-1) x 5
    score_mean_mv = mean(score_ts_mv,1); % check that it is approximately zero
    disp('Average score: should be approximately zero')
    disp(score_mean_mv)
    % figure(2000+thatGroup), plot(score_ts_mv) % useful to plot the score process

    
    %% Estimation:
    % After starting values are computed, run the remaning part of the code to estimate the model via likelihood search.
    % If starting values are accurate, a few rounds of likelihood search are typically enough to estimate the model. 
    % If the likelihood search does not converge or converge to unreasonable values, change starting values as indicated above.
    % Uncomment the remaining part of the code to obtain model estimates after starting values are computed:
    
%     param_estimate_mv = parameters_series_mv(thatGroup,:); % starting values
%     param_start_all = param_convert_true_reparam(param_estimate_mv);
% 	  CFavg_mv = nanmean(that_industry_CF_final_mv,1); % average CF across firms, for each year
%     if CFavg_mv(T1) <= 0 % not allowed
%         CFavg_mv(T1) = 0.01;
%     end
% 	  Xhat1_mv = CFavg_mv(T1) / (param_estimate_mv(3)*dt); % improved calculation of P_1
%     options = optimset('TolFun',1e-1,'TolX',1e-1);
%     [param_estimate_all_reparam, minusloglike_estimate_all_mv, exit_flag_all_mv] = fminunc(@(param) loglike_model_cash_flow_reparam_mv(param,data_mv(:,T1:Tend),data_zeroone(:,T1:Tend),Tobs_mv,nFirms,dt,Xhat1_mv),param_start_all,options);
%     param_estimate_all_mv = param_convert_reparam_true(param_estimate_all_reparam);
%     disp('Current parameter estimates:')
%     disp(param_estimate_all_mv)
%     
%     parameters_series_all_mv(thatGroup,:) = param_estimate_all_mv; % muP, sigmaP, alpha, sigmaA, rho 
%     loglike_series_all_mv(thatGroup) = - minusloglike_estimate_all_mv; % value of the log-likelihood
%     convergence_series_all_mv(thatGroup) = exit_flag_all_mv; % exit flag
%     
%     % Check that the vector valued score process has approximately zero mean:
%     loglike_ts_all_mv = @(param) loglike_model_cash_flow_timeseries_mv(param,data_mv,data_zeroone,Tobs,nFirms,dt,Xhat1_mv)'; % (Tobs-1) x 1
%     score_ts_all_mv = gradp( loglike_ts_all_mv, param_estimate_all_mv' ); % (Tobs-1) x 5
%     score_mean_all_mv = mean(score_ts_all_mv,1); % check that it is approximately zero
%     disp('Average score: should be approximately zero')
%     disp(score_mean_all_mv)
    
end

